package com.ml.room.terrace.process;

import cn.hutool.core.collection.CollectionUtil;
import com.alibaba.fastjson.JSON;
import com.ml.room.dao.IBuildingRecordDao;
import com.ml.room.repository.entity.BuildingRecord;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;

import java.util.*;

/**
 * WebMagic page processor for the Yunfang (云房) platform's building-record
 * (房屋备案) listing page.
 *
 * <p>For every data row of the {@code table.tbl} listing table it reads the
 * record number, asks the DAO whether that record number is already stored,
 * and collects the rows that are not. The collected records are published on
 * the page under the {@code buildingRecords} field, together with an
 * {@code isRepeat} flag that is {@code true} when no new record was found.
 *
 * <p>Author: IDai. Created 2021-08-13 14:50 (Friday).
 */
@Component
public class BuildingRecordPageProcess implements PageProcessor {
    @Autowired
    private IBuildingRecordDao buildingRecordDao;

    /** Logger for this processor. */
    private static final Logger logger = LoggerFactory.getLogger(BuildingRecordPageProcess.class);

    /** Base URL prepended to the relative detail links found in the listing. */
    public static final String PREFIX_URL = "http://220.178.124.94:8010/fangjia/ws/";

    /**
     * Number of {@code <td>} cells a row must contain to be parsed: record number,
     * building name, building number, building area, set number, average price.
     */
    private static final int EXPECTED_CELL_COUNT = 6;

    // Crawl settings: 3 retries, 1000 ms sleep between requests, 10000 ms timeout.
    // See the WebMagic documentation for the remaining options (charset, user agent, ...).
    private final Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000);
    //.setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");

    /**
     * Returns the crawl settings used for this processor.
     *
     * @return the shared {@link Site} configuration
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Extracts the not-yet-stored building records from a listing page.
     *
     * <p>When the page contains at least one {@code table.tbl tr} node, the fields
     * {@code isRepeat} and {@code buildingRecords} are put on the page. When it
     * contains none, no field is put (unchanged from the previous behaviour).
     * Rows that cannot be parsed are skipped with a warning instead of aborting
     * the whole page.
     *
     * @param page the downloaded page handed over by WebMagic
     */
    @Override
    public void process(Page page) {
        // Log only the URL: serialising the whole Page object to JSON on every
        // request is needlessly expensive and floods the log.
        logger.info("【合肥新房】 - 房屋备案 - 数据抽取，入参url：{}", page.getUrl());
        Set<BuildingRecord> buildingRecords = new LinkedHashSet<>();
        // All rows of the building-record listing table.
        List<Selectable> trContents = page.getHtml().css("table.tbl tr").nodes();
        if (CollectionUtil.isNotEmpty(trContents)) {
            for (Selectable trSelectable : trContents) {
                if (!isDataRow(trSelectable)) {
                    continue;
                }
                BuildingRecord buildingRecord = parseRow(trSelectable);
                if (buildingRecord != null) {
                    buildingRecords.add(buildingRecord);
                }
            }
            // isRepeat == true means this page yielded nothing new.
            page.putField("isRepeat", buildingRecords.isEmpty());
            page.putField("buildingRecords", buildingRecords);
        }
        logger.info("【合肥新房】 - 房屋备案 - 数据抽取，出参page：{}", JSON.toJSONString(buildingRecords));
    }

    /**
     * Tells data rows apart from header rows: a data row has a centre-aligned
     * cell and a non-blank link text.
     */
    private boolean isDataRow(Selectable trSelectable) {
        return StringUtils.equals(trSelectable.css("td", "align").toString(), "center")
                && StringUtils.isNotBlank(trSelectable.css("td a[href]", "text").toString());
    }

    /**
     * Converts one data row into a {@link BuildingRecord}.
     *
     * @return the parsed record, or {@code null} when the row is malformed or its
     *         record number is already known to the DAO
     */
    private BuildingRecord parseRow(Selectable trSelectable) {
        List<Selectable> tdContents = trSelectable.css("td").nodes();
        if (CollectionUtil.isEmpty(tdContents)) {
            return null;
        }
        if (tdContents.size() < EXPECTED_CELL_COUNT) {
            // Indexing cells 1..5 below would otherwise throw IndexOutOfBoundsException.
            logger.warn("【合肥新房】 - 房屋备案 - 数据抽取，skipping row with {} cells (expected {})",
                    tdContents.size(), EXPECTED_CELL_COUNT);
            return null;
        }

        Selectable firstCell = tdContents.get(0);
        String remarkNo = firstCell.css("a", "text").toString();
        if (StringUtils.isBlank(remarkNo)) {
            // The row filter accepts a link in any cell; the record number must be in the first one.
            logger.warn("【合肥新房】 - 房屋备案 - 数据抽取，skipping row without a record number");
            return null;
        }

        // A non-zero result means the record number is already stored.
        if (buildingRecordDao.isHaveThisBuildingRecord(remarkNo) != 0) {
            return null;
        }

        String setNumberText = tdContents.get(4).css("td", "text").toString();
        int setNumber;
        try {
            setNumber = Integer.parseInt(StringUtils.trimToEmpty(setNumberText));
        } catch (NumberFormatException e) {
            // One bad cell must not discard every other row of the page.
            logger.warn("【合肥新房】 - 房屋备案 - 数据抽取，skipping record {}: set number '{}' is not an integer",
                    remarkNo, setNumberText);
            return null;
        }

        BuildingRecord buildingRecord = new BuildingRecord();
        buildingRecord.setRemarkNo(remarkNo);
        String href = firstCell.css("a", "href").toString();
        if (href != null) {
            // Without this guard a missing href would yield a URL ending in "null".
            buildingRecord.setInfoUrl(PREFIX_URL + href);
        }
        buildingRecord.setBuildingName(tdContents.get(1).css("a", "text").toString());
        buildingRecord.setBuildingNo(tdContents.get(2).css("td", "text").toString());
        buildingRecord.setBuildingArea(tdContents.get(3).css("td", "text").toString());
        buildingRecord.setSetNumber(setNumber);
        buildingRecord.setAveragePrice(tdContents.get(5).css("td", "text").toString());
        return buildingRecord;
    }
}
